Load and normalize the data

The miRNA and mRNA raw count are normalized and filtered for low expression. For mRNA the genes with the most variance are kept.

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                   Load count data and normalization                        ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##

# miRNA data
pathCount = "~/Documents/2.Omics_analysis/1.2021_Progeria/1.Data/miRNA/mature_sense.tsv"
miRNA_count = read.csv(pathCount, sep = ",", row.names = 1) # miRNA count
miRNA_count = as.matrix(miRNA_count)
order = c(
  "X2021A53_R1",
  "X2021A54_R1",
  "X2021A58_R1",
  "X2021A59_R1",
  "X2021A55_R1",
  "X2021A56_R1",
  "X2021A57_R1",
  "X2021A60_R1",
  "X2021A61_R1",
  "X2021A62_R1"
) # Order of the samples
miRNA_count = miRNA_count[, order]
colnames(miRNA_count) = c(
  "WTP8_1",
  "WTP8_2",
  "WTP12_1",
  "WTP12_2",
  "MutantP8_1",
  "MutantP8_2",
  "MutantP8_3",
  "MutantP12_1",
  "MutantP12_2",
  "MutantP12_3"
)

# mRNA data, I removed the two sample taht had a batch effect as they are not included in the miRNA data
pathCount = "~/Documents/2.Omics_analysis/1.2021_Progeria/1.Data/mRNA/gene_count_matrix.tsv"
mRNA_count = as.matrix(read.csv(pathCount, sep = ",", row.names = "gene_id")) # Gene count
order = c(
  "X2021A40",
  "X2021A41",
  "X2021A92",
  "X2021A45",
  "X2021A46",
  "X2021A93",
  "X2021A42",
  "X2021A43",
  "X2021A44",
  "X2021A47",
  "X2021A48",
  "X2021A49"
) # Order of the samples
mRNA_count = mRNA_count[, order]
colnames(mRNA_count) = c(
  "WTP8_1",
  "WTP8_2",
  "WTP8_3",
  "WTP12_1",
  "WTP12_2",
  "WTP12_3",
  "MutantP8_1",
  "MutantP8_2",
  "MutantP8_3",
  "MutantP12_1",
  "MutantP12_2",
  "MutantP12_3"
)
SYMBOL_ID = gsub(".*\\|", "", rownames(mRNA_count)) # Name of the genes (SYMBOL notation)
ENS_ID = gsub("\\..*", "",rownames(mRNA_count)) # Name of the genes (ENSEMBL notation)
gene_names = ENS_ID
names(gene_names) = SYMBOL_ID
row.names(mRNA_count) = SYMBOL_ID

mRNA_count = mRNA_count[, c(
  "WTP8_1",
  "WTP8_2",
  "WTP12_1",
  "WTP12_2",
  "MutantP8_1",
  "MutantP8_2",
  "MutantP8_3",
  "MutantP12_1",
  "MutantP12_2",
  "MutantP12_3"
)]

# Data information (same for mi and mRNA)
coldata = data.frame(
  Sample = colnames(mRNA_count),
  Genotype = c(
    "WT",
    "WT",
    "WT",
    "WT",
    "Mutant",
    "Mutant",
    "Mutant",
    "Mutant",
    "Mutant",
    "Mutant"
  ),
  Passage = c("P8", "P8", "P12", "P12",
              "P8", "P8", "P8", "P12", "P12", "P12"),
  Genotype_Passage = c(
    "WTP8",
    "WTP8",
    "WTP12",
    "WTP12",
    "MutantP8",
    "MutantP8",
    "MutantP8",
    "MutantP12",
    "MutantP12",
    "MutantP12"
  ),
  row.names = TRUE
)

# Desq object for normalisation and vst tranformation recomanded by MOFA. I also filtered for lowly expressed genes
dds_miRNA_WTvsMut = DESeqDataSetFromMatrix(countData = miRNA_count,
                                           colData = coldata,
                                           design = ~ Genotype)
keep = (rowSums(counts(dds_miRNA_WTvsMut))) >= 10
dds_miRNA_WTvsMut = dds_miRNA_WTvsMut[keep, ]
dds_miRNA_WTvsMut = estimateSizeFactors(dds_miRNA_WTvsMut) ##
vsd = varianceStabilizingTransformation(dds_miRNA_WTvsMut, blind = FALSE) # blind = FALSE to take in account the experimental design
miRNA_norm = assay(vsd)


dds_mRNA_WTvsMut = DESeqDataSetFromMatrix(countData = mRNA_count,
                                          colData = coldata,
                                          design = ~ Genotype)
keep = (rowSums(counts(dds_mRNA_WTvsMut))) >= 10
dds_mRNA_WTvsMut = dds_mRNA_WTvsMut[keep, ]
dds_mRNA_WTvsMut <- estimateSizeFactors(dds_mRNA_WTvsMut) ##
vsd = varianceStabilizingTransformation(dds_mRNA_WTvsMut, blind = FALSE) # blind = FALSE to take in account the experimental design

var = as.data.frame(apply(assay(vsd), 1, var))
colnames(var) = "x"
keep2 = rownames(filter(var, x > max(x)/150))
mRNA_norm = assay(vsd)[keep2, ]
# Matrix containing the data
data = list("mRNA" = mRNA_norm, "miRNA" = miRNA_norm)

Run MOFA

MOFA is a factor analysis model that provides a general framework for the integration of multi-omic data sets in an unsupervised fashion. Intuitively, MOFA can be viewed as a versatile and statistically rigorous generalization of principal component analysis to multi-omics data. Given several data matrices with measurements of multiple -omics data types on the same or on overlapping sets of samples, MOFA infers an interpretable low-dimensional representation in terms of a few latent factors. These learnt factors represent the driving sources of variation across data modalities, thus facilitating the identification of cellular states or disease subgroups.

Create MOFA object

The MOFA object contains the miRNA and mRNA count data. It is important to note that the initial mRNA data set after filtering for lowly expressed genes contained approximately 30 000 genes for only 740 miRNA. Knowing that I filtered the data to only keep ~ 7000 genes that had the most important variance between samples.

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                          Create the MOFA object.                           ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
MOFAobject = create_mofa(data)
#Plot overview of the input data, including the number of samples, views, features, and the missing assays.
plot_data_overview(MOFAobject)

MOFA options

To run the analysis, we have to define the options, there classified in 3 categories, for each we’ll define the following options :

  • Data options :

    • scale_views: whether to scale views to the same total variance? Default is FALSE (if views have different ranges/variances, it is good practice to scale each view to unit variance.)

    • scale_groups: scale groups to the same total variance? (FALSE by Default)

  • Model options :

    • likelihoods: likelihood per view (options are “gaussian”, “poisson”, “bernoulli”). For RNA seq data we will use the gaussian one.

    • num_factors: number of factors, given the small number of sample, we will set it to 5. Those factors saprated the data (genes and mRNA) in set that explain the differences between samples. With more samples, we could use more features.

  • Training options :

    • maxiter: maximum number of iterations, set to 5000

    • convergence_mode: “fast”, “medium”, “slow”. For a througought exploration I set it to slow mode.

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                                 MOFA options                               ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
data_opts = get_default_data_options(MOFAobject)
model_opts = get_default_model_options(MOFAobject)
train_opts = get_default_training_options(MOFAobject)
train_opts$convergence_mode = "slow"
train_opts$maxiter = 5000
#data_opts
#model_opts
#train_opts

Visual inspection of RNA-seq centered/normalized counts data

hist(MOFAobject@data$mRNA$group1 ,
     breaks = 100,
     main = "Histogram of mRNA count values stored in MOFA object")

hist(MOFAobject@data$miRNA$group1 ,
     breaks = 100,
     main = "Histogram of miRNA count values stored in MOFA object")

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                          Prepare the MOFA object                           ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
MOFAobject = prepare_mofa(
  MOFAobject,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
)

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                                  Run MOFA                                  ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##

MOFAobject = run_mofa(MOFAobject, outfile = "MOFA2.hdf5", use_basilisk = TRUE) 

Results interpretation

In this section we will go over the results of the analysis and explain what they mean. Fist of all here is a table that summarize the information of our data.

#Add sample metadata to the model
samples_metadata(MOFAobject) = data.frame(
  sample = colnames(mRNA_count),
  Genotype = c("WT","WT","WT","WT","Mutant", "Mutant", 
               "Mutant", "Mutant", "Mutant", "Mutant"),
  Passage = c("P8", "P8", "P12", "P12",
              "P8", "P8", "P8", "P12", "P12", "P12")
)
samples_metadata(MOFAobject)

Results

General factors plots

Factors correlation plot

There is no correlation between the factor, which is ideal.

plot_factor_cor(MOFAobject)

Variance decomposition

This figures represent the percentage of variance explained by factors in each view (miRNA or mRNA).

  • Noisy datasets will result in a lower amounts of variance explained.
  • Higher the number of samples –> smaller the total variance explained.
  • Higher the number of factors –> higher the total variance explained.

In this data set, using only K=5 factors the model explains up to ~80% of the variation in the miRNA data and ~75% of the variation in the mRNA seq data. The latent factors inferred by MOFA represent the underlying principal sources of variation among the samples.

In this plot we can see the variance per factor, in each view.

plot_variance_explained(MOFAobject, max_r2= 68)

Here is the total variance per view.

plot_variance_explained(MOFAobject, plot_total = T)[[2]]

#MOFAobject@cache$variance_explained$r2_per_factor

Summary plot of factors

We can see on the following plot that factor 1 separates the data according to the genotypes of the samples. This separation is less clear with other factors. It seems that no factor can clearly separate the samples according to the passages.

plot_factor(
  MOFAobject,
  factor = 1:MOFAobject@dimensions$K,
  color_by = "Genotype",
  shape_by = "Passage"
)

Combinations of factors

Here we can visualize with more detailed the combination of multiple factors and how they separate the samples (according to their genotype (top) or passage (bottom)). The conclusion are the same.

plot_factors(MOFAobject,
             factors = 1:MOFAobject@dimensions$K,
             color_by = "Genotype")

plot_factors(MOFAobject,
             factors = 1:MOFAobject@dimensions$K,
             color_by = "Passage")

Now we will explore each factor separatly.

Exploration of factor 1

Table of weights/factors

Zoom on top 10 absolute weight for factor 1. We plot the 10 genes (top) and miRNA (bottom) having the highest absolute weight. The weights mesure how strong each feature/gene relates to each factor (no association = 0 ; strong association = high absolute values). The sign of the weight gives the direction of the effect (e.g. a positive weight indicates that the gene has higher levels in the samples with positive factor values).

plot_top_weights(MOFAobject,
                 view = "mRNA",
                 factor = 1,
                 nfeatures = 10)

plot_top_weights(MOFAobject,
                 view = "miRNA",
                 factor = 1,
                 nfeatures = 10)

Data heatmap

The heatmaps allows us to visualize the expression level of genes (top) and miRNA (bottom) involved in factor 1. The 150 RNA with the highest weights are plotted.

n = 150
plot_data_heatmap(
  MOFAobject,
  view = "mRNA",
  factor = 1,
  features = n,
  cluster_rows = FALSE,
  cluster_cols = TRUE,
  show_rownames = FALSE,
  show_colnames = TRUE,
  scale = "row"
)

plot_data_heatmap(
  MOFAobject,
  view = "miRNA",
  factor = 1,
  features = n,
  cluster_rows = FALSE,
  cluster_cols = TRUE,
  show_rownames = FALSE,
  show_colnames = TRUE,
  scale = "row"
)

We can see a clear separation between WT and mutant.

Correlation with DESeq data

In this plot, the p_value from the differential analysis of wt vs mutant cells, is ploted against the weights obtained using MOFA. The red line represent the p_value threshold (0.01) and the

f = 1
mRNA_weights = get_weights(MOFAobject, views = "mRNA", factors = f, as.data.frame = T)
row.names(mRNA_weights) = mRNA_weights$feature
DEG = read.csv("/home/nadine/Documents/2.Omics_analysis/1.2021_Progeria/2.mRNA_Analysis/DEG Results/2021Progeria_Mutant_vs_WT_alpha0.01.csv", header = TRUE, sep = ";", row.names = 1)

mRNA_weights$pvalue = NA
mRNA_weights$l2fc =  NA
for (i in rownames(mRNA_weights)){
  gene = gene_names[i]
  mRNA_weights[i,]$pvalue = DEG[gene,]$pvalue
  mRNA_weights[i,]$l2fc = DEG[gene,]$log2FoldChange
}

ggplot(mRNA_weights, aes(x = value, y = -log10(pvalue))) + 
  geom_point(aes(color = l2fc), size=3) +
  scale_colour_gradient2(midpoint = 0, low = "darkred", mid = "white", high = "darkgreen") +
  geom_hline(yintercept = -log10(0.01), color = "red") +
  ggtitle(paste("Factor", f ,"repartition of each gene weigh against it DESeq2 pvalue")) +
  xlab("MOFA weight") + 
  geom_text_repel(
    data = subset(mRNA_weights[order(mRNA_weights$pvalue),], pvalue < 0.01)[1:20,],
    aes(label = feature),
    size = 3) +
  theme_minimal()

miRNA_weights = get_weights(MOFAobject, views = "miRNA", factors = f, as.data.frame = T)
row.names(miRNA_weights) = miRNA_weights$feature
DEmiR = read.csv("/home/nadine/Documents/2.Omics_analysis/1.2021_Progeria/4.miRNA_Analysis/DEG Results/2021Progeria_Mutant_vs_WT_alpha0.01.csv", header = TRUE, sep = ";", row.names = 1)

miRNA_weights$pvalue = NA
miRNA_weights$l2fc =  NA
for (i in rownames(miRNA_weights)){
  miRNA_weights[i,]$pvalue = DEmiR[i,]$pvalue
  miRNA_weights[i,]$l2fc = DEmiR[i,]$log2FoldChange
}

ggplot(miRNA_weights, aes(x = value, y = -log10(pvalue))) + 
  geom_point(aes(color = l2fc), size=3) +
  scale_colour_gradient2(midpoint = 0, low = "darkred", mid = "white", high = "darkgreen") +
  geom_hline(yintercept = -log10(0.01), color = "red") +
  ggtitle(paste("Factor", f ,"repartition of each miRNA weigh against it DESeq2 pvalue")) +
  xlab("MOFA weight") + 
  geom_text_repel(
    data = subset(miRNA_weights[order(miRNA_weights$pvalue),], pvalue < 0.01)[1:20,],
    aes(label = feature),
    size = 3)+
  theme_minimal()

Exploration of factor 2

Table of weights/factors

plot_top_weights(MOFAobject,
                 view = "mRNA",
                 factor = 2,
                 nfeatures = 10)

plot_top_weights(MOFAobject,
                 view = "miRNA",
                 factor = 2,
                 nfeatures = 10)

Data heatmap

n = 750
plot_data_heatmap(
  MOFAobject,
  view = "mRNA",
  factor = 2,
  features = n,
  cluster_rows = FALSE,
  cluster_cols = TRUE,
  show_rownames = FALSE,
  show_colnames = TRUE,
  scale = "row"
)

plot_data_heatmap(
  MOFAobject,
  view = "miRNA",
  factor = 2,
  features = n,
  cluster_rows = FALSE,
  cluster_cols = TRUE,
  show_rownames = FALSE,
  show_colnames = TRUE,
  scale = "row"
)

Correlation with DESeq data

f = 2
mRNA_weights = get_weights(MOFAobject, views = "mRNA", factors = f, as.data.frame = T)
row.names(mRNA_weights) = mRNA_weights$feature
DEG = read.csv("/home/nadine/Documents/2.Omics_analysis/1.2021_Progeria/2.mRNA_Analysis/DEG Results/2021Progeria_Mutant_vs_WT_alpha0.01.csv", header = TRUE, sep = ";", row.names = 1)

mRNA_weights$pvalue = NA
mRNA_weights$l2fc =  NA
for (i in rownames(mRNA_weights)){
  gene = gene_names[i]
  mRNA_weights[i,]$pvalue = DEG[gene,]$pvalue
  mRNA_weights[i,]$l2fc = DEG[gene,]$log2FoldChange
}

ggplot(mRNA_weights, aes(x = value, y = -log10(pvalue))) + 
  geom_point(aes(color = l2fc), size=3) +
  scale_colour_gradient2(midpoint = 0, low = "darkred", mid = "white", high = "darkgreen") +
  geom_hline(yintercept = -log10(0.01), color = "red") +
  ggtitle(paste("Factor", f ,"repartition of each gene weigh against it DESeq2 pvalue")) +
  xlab("MOFA weight") + 
  geom_text_repel(
    data = subset(mRNA_weights[order(mRNA_weights$pvalue),], pvalue < 0.05)[1:20,],
    aes(label = feature),
    size = 3) +
  theme_minimal()

miRNA_weights = get_weights(MOFAobject, views = "miRNA", factors = f, as.data.frame = T)
row.names(miRNA_weights) = miRNA_weights$feature
DEmiR = read.csv("/home/nadine/Documents/2.Omics_analysis/1.2021_Progeria/4.miRNA_Analysis/DEG Results/2021Progeria_Mutant_vs_WT_alpha0.01.csv", header = TRUE, sep = ";", row.names = 1)

miRNA_weights$pvalue = NA
miRNA_weights$l2fc =  NA
for (i in rownames(miRNA_weights)){
  miRNA_weights[i,]$pvalue = DEmiR[i,]$pvalue
  miRNA_weights[i,]$l2fc = DEmiR[i,]$log2FoldChange
}

ggplot(miRNA_weights, aes(x = value, y = -log10(pvalue))) + 
  geom_point(aes(color = l2fc), size=3) +
  scale_colour_gradient2(midpoint = 0, low = "darkred", mid = "white", high = "darkgreen") +
  geom_hline(yintercept = -log10(0.01), color = "red") +
  ggtitle(paste("Factor", f ,"repartition of each miRNA weigh against it DESeq2 pvalue")) +
  xlab("MOFA weight") + 
  geom_text_repel(
    data = subset(miRNA_weights[order(miRNA_weights$pvalue),], pvalue < 0.05)[1:20,],
    aes(label = feature),
    size = 3)+
  theme_minimal()

Exploration of factor 3

Table of weights/factors

plot_top_weights(MOFAobject,
                 view = "mRNA",
                 factor = 3,
                 nfeatures = 10)

plot_top_weights(MOFAobject,
                 view = "miRNA",
                 factor = 3,
                 nfeatures = 10)

Data heatmap

n = 750
plot_data_heatmap(
  MOFAobject,
  view = "mRNA",
  factor = 3,
  features = n,
  cluster_rows = FALSE,
  cluster_cols = TRUE,
  show_rownames = FALSE,
  show_colnames = TRUE,
  scale = "row"
)

plot_data_heatmap(
  MOFAobject,
  view = "miRNA",
  factor = 3,
  features = n,
  cluster_rows = FALSE,
  cluster_cols = TRUE,
  show_rownames = FALSE,
  show_colnames = TRUE,
  scale = "row"
)

Correlation with DESeq data

f = 3
mRNA_weights = get_weights(MOFAobject, views = "mRNA", factors = f, as.data.frame = T)
row.names(mRNA_weights) = mRNA_weights$feature
DEG = read.csv("/home/nadine/Documents/2.Omics_analysis/1.2021_Progeria/2.mRNA_Analysis/DEG Results/2021Progeria_Mutant_vs_WT_alpha0.01.csv", header = TRUE, sep = ";", row.names = 1)

mRNA_weights$pvalue = NA
mRNA_weights$l2fc =  NA
for (i in rownames(mRNA_weights)){
  gene = gene_names[i]
  mRNA_weights[i,]$pvalue = DEG[gene,]$pvalue
  mRNA_weights[i,]$l2fc = DEG[gene,]$log2FoldChange
}

ggplot(mRNA_weights, aes(x = value, y = -log10(pvalue))) + 
  geom_point(aes(color = l2fc), size=3) +
  scale_colour_gradient2(midpoint = 0, low = "darkred", mid = "white", high = "darkgreen") +
  geom_hline(yintercept = -log10(0.01), color = "red") +
  ggtitle(paste("Factor", f ,"repartition of each gene weigh against it DESeq2 pvalue")) +
  xlab("MOFA weight") + 
  geom_text_repel(
    data = subset(mRNA_weights[order(mRNA_weights$pvalue),], pvalue < 0.05)[1:20,],
    aes(label = feature),
    size = 3) +
  theme_minimal()

miRNA_weights = get_weights(MOFAobject, views = "miRNA", factors = f, as.data.frame = T)
row.names(miRNA_weights) = miRNA_weights$feature
DEmiR = read.csv("/home/nadine/Documents/2.Omics_analysis/1.2021_Progeria/4.miRNA_Analysis/DEG Results/2021Progeria_Mutant_vs_WT_alpha0.01.csv", header = TRUE, sep = ";", row.names = 1)

miRNA_weights$pvalue = NA
miRNA_weights$l2fc =  NA
for (i in rownames(miRNA_weights)){
  miRNA_weights[i,]$pvalue = DEmiR[i,]$pvalue
  miRNA_weights[i,]$l2fc = DEmiR[i,]$log2FoldChange
}

ggplot(miRNA_weights, aes(x = value, y = -log10(pvalue))) + 
  geom_point(aes(color = l2fc), size=3) +
  scale_colour_gradient2(midpoint = 0, low = "darkred", mid = "white", high = "darkgreen") +
  geom_hline(yintercept = -log10(0.01), color = "red") +
  ggtitle(paste("Factor", f ,"repartition of each miRNA weigh against it DESeq2 pvalue")) +
  xlab("MOFA weight") + 
  geom_text_repel(
    data = subset(miRNA_weights[order(miRNA_weights$pvalue),], pvalue < 0.05)[1:20,],
    aes(label = feature),
    size = 3)+
  theme_minimal()

Exploration of factor 4

Table of weights/factors

plot_top_weights(MOFAobject,
                 view = "mRNA",
                 factor = 4,
                 nfeatures = 10)

plot_top_weights(MOFAobject,
                 view = "miRNA",
                 factor = 4,
                 nfeatures = 10)

Data heatmap

n = 750
plot_data_heatmap(
  MOFAobject,
  view = "mRNA",
  factor = 4,
  features = n,
  cluster_rows = FALSE,
  cluster_cols = TRUE,
  show_rownames = FALSE,
  show_colnames = TRUE,
  scale = "row"
)

plot_data_heatmap(
  MOFAobject,
  view = "miRNA",
  factor = 4,
  features = n,
  cluster_rows = FALSE,
  cluster_cols = TRUE,
  show_rownames = FALSE,
  show_colnames = TRUE,
  scale = "row"
)

Correlation with DESeq data

f = 4
mRNA_weights = get_weights(MOFAobject, views = "mRNA", factors = f, as.data.frame = T)
row.names(mRNA_weights) = mRNA_weights$feature
DEG = read.csv("/home/nadine/Documents/2.Omics_analysis/1.2021_Progeria/2.mRNA_Analysis/DEG Results/2021Progeria_Mutant_vs_WT_alpha0.01.csv", header = TRUE, sep = ";", row.names = 1)

mRNA_weights$pvalue = NA
mRNA_weights$l2fc =  NA
for (i in rownames(mRNA_weights)){
  gene = gene_names[i]
  mRNA_weights[i,]$pvalue = DEG[gene,]$pvalue
  mRNA_weights[i,]$l2fc = DEG[gene,]$log2FoldChange
}

ggplot(mRNA_weights, aes(x = value, y = -log10(pvalue))) + 
  geom_point(aes(color = l2fc), size=3) +
  scale_colour_gradient2(midpoint = 0, low = "darkred", mid = "white", high = "darkgreen") +
  geom_hline(yintercept = -log10(0.01), color = "red") +
  ggtitle(paste("Factor", f ,"repartition of each gene weigh against it DESeq2 pvalue")) +
  xlab("MOFA weight") + 
  geom_text_repel(
    data = subset(mRNA_weights[order(mRNA_weights$pvalue),], pvalue < 0.05)[1:20,],
    aes(label = feature),
    size = 3) +
  theme_minimal()

miRNA_weights = get_weights(MOFAobject, views = "miRNA", factors = f, as.data.frame = T)
row.names(miRNA_weights) = miRNA_weights$feature
DEmiR = read.csv("/home/nadine/Documents/2.Omics_analysis/1.2021_Progeria/4.miRNA_Analysis/DEG Results/2021Progeria_Mutant_vs_WT_alpha0.01.csv", header = TRUE, sep = ";", row.names = 1)

miRNA_weights$pvalue = NA
miRNA_weights$l2fc =  NA
for (i in rownames(miRNA_weights)){
  miRNA_weights[i,]$pvalue = DEmiR[i,]$pvalue
  miRNA_weights[i,]$l2fc = DEmiR[i,]$log2FoldChange
}

ggplot(miRNA_weights, aes(x = value, y = -log10(pvalue))) + 
  geom_point(aes(color = l2fc), size=3) +
  scale_colour_gradient2(midpoint = 0, low = "darkred", mid = "white", high = "darkgreen") +
  geom_hline(yintercept = -log10(0.01), color = "red") +
  ggtitle(paste("Factor", f ,"repartition of each miRNA weigh against it DESeq2 pvalue")) +
  xlab("MOFA weight") + 
  geom_text_repel(
    data = subset(miRNA_weights[order(miRNA_weights$pvalue),], pvalue < 0.05)[1:20,],
    aes(label = feature),
    size = 3)+
  theme_minimal()

Exploration of factor 5

Table of weights/factors

plot_top_weights(MOFAobject,
                 view = "mRNA",
                 factor = 5,
                 nfeatures = 10)

plot_top_weights(MOFAobject,
                 view = "miRNA",
                 factor = 5,
                 nfeatures = 10)

Data heatmap

n = 750
plot_data_heatmap(
  MOFAobject,
  view = "mRNA",
  factor = 5,
  features = n,
  cluster_rows = FALSE,
  cluster_cols = TRUE,
  show_rownames = FALSE,
  show_colnames = TRUE,
  scale = "row"
)

plot_data_heatmap(
  MOFAobject,
  view = "miRNA",
  factor = 5,
  features = n,
  cluster_rows = FALSE,
  cluster_cols = TRUE,
  show_rownames = FALSE,
  show_colnames = TRUE,
  scale = "row"
)

Correlation with DESeq data

f = 5
mRNA_weights = get_weights(MOFAobject, views = "mRNA", factors = f, as.data.frame = T)
row.names(mRNA_weights) = mRNA_weights$feature
DEG = read.csv("/home/nadine/Documents/2.Omics_analysis/1.2021_Progeria/2.mRNA_Analysis/DEG Results/2021Progeria_Mutant_vs_WT_alpha0.01.csv", header = TRUE, sep = ";", row.names = 1)

mRNA_weights$pvalue = NA
mRNA_weights$l2fc =  NA
for (i in rownames(mRNA_weights)){
  gene = gene_names[i]
  mRNA_weights[i,]$pvalue = DEG[gene,]$pvalue
  mRNA_weights[i,]$l2fc = DEG[gene,]$log2FoldChange
}

ggplot(mRNA_weights, aes(x = value, y = -log10(pvalue))) + 
  geom_point(aes(color = l2fc), size=3) +
  scale_colour_gradient2(midpoint = 0, low = "darkred", mid = "white", high = "darkgreen") +
  geom_hline(yintercept = -log10(0.01), color = "red") +
  ggtitle(paste("Factor", f ,"repartition of each gene weigh against it DESeq2 pvalue")) +
  xlab("MOFA weight") + 
  geom_text_repel(
    data = subset(mRNA_weights[order(mRNA_weights$pvalue),], pvalue < 0.05)[1:20,],
    aes(label = feature),
    size = 3) +
  theme_minimal()

miRNA_weights = get_weights(MOFAobject, views = "miRNA", factors = f, as.data.frame = T)
row.names(miRNA_weights) = miRNA_weights$feature
DEmiR = read.csv("/home/nadine/Documents/2.Omics_analysis/1.2021_Progeria/4.miRNA_Analysis/DEG Results/2021Progeria_Mutant_vs_WT_alpha0.01.csv", header = TRUE, sep = ";", row.names = 1)

miRNA_weights$pvalue = NA
miRNA_weights$l2fc =  NA
for (i in rownames(miRNA_weights)){
  miRNA_weights[i,]$pvalue = DEmiR[i,]$pvalue
  miRNA_weights[i,]$l2fc = DEmiR[i,]$log2FoldChange
}

ggplot(miRNA_weights, aes(x = value, y = -log10(pvalue))) + 
  geom_point(aes(color = l2fc), size=3) +
  scale_colour_gradient2(midpoint = 0, low = "darkred", mid = "white", high = "darkgreen") +
  geom_hline(yintercept = -log10(0.01), color = "red") +
  ggtitle(paste("Factor", f ,"repartition of each miRNA weigh against it DESeq2 pvalue")) +
  xlab("MOFA weight") + 
  geom_text_repel(
    data = subset(miRNA_weights[order(miRNA_weights$pvalue),], pvalue < 0.05)[1:20,],
    aes(label = feature),
    size = 3)+
  theme_minimal()

Functional enrichement

In this section I will perform a gene set enrichment analysis (so not on the miRNA) using the GO database in three cases : considering only the genes with negative weight, positive and all. I will generate 3 plots for each analysis :

  • Plot_enrichment_heatmap: plot a heatmap of gene sets (rows) versus factors (columns) where each entry corresponds to the log p-value. This is useful to get an overview on which factors show statistically enriched GO terms.

  • Plot_enrichment: plot the top significant GO terms for a specific factor.

  • Plot_enrichment_detailed: plot a detailed output, highlighting the genes that are contributing to the enrichment of each GO terms.

Results of the enrichement

# C2: curated gene sets from online pathway databases, publications in PubMed, and knowledge of domain experts.
# data("MSigDB_v6.0_C2_mouse") 

# C5: extracted from the Gene Ontology data.base
data("MSigDB_v6.0_C5_mouse") 

features_names(MOFAobject)[["mRNA"]] <- toupper(features_names(MOFAobject)[["mRNA"]])

# Enrichment using the GO databases
enrichment_GO_neg = run_enrichment(
  MOFAobject,
  view = "mRNA",
  feature.sets = MSigDB_v6.0_C5_mouse,
  sign = "negative")

enrichment_GO_pos = run_enrichment(
  MOFAobject,
  view = "mRNA",
  feature.sets = MSigDB_v6.0_C5_mouse,
  sign = "positive")

enrichment_GO_all = run_enrichment(
  MOFAobject,
  view = "mRNA",
  feature.sets = MSigDB_v6.0_C5_mouse,
  sign = "all")

All genes

plot_enrichment_heatmap(enrichment_GO_all)

Most of the enrichment GO terms seem to be in factor 1 so we will plot the top enriched term of this factor, and a detail of the genes that contribute to it.

g = 6
p = 8
n = 15
plot_enrichment(enrichment_GO_all,
                factor = 1,
                max.pathways = n)

plot_enrichment_detailed(
  enrichment_GO_all,
  factor = 1,
  max.genes = g,
  max.pathways = p)

Genes whith negative weights

plot_enrichment_heatmap(enrichment_GO_neg)

Most of the enrichment GO terms seem to be in factor 1 so we will plot the top enriched term of this factor, and a detail of the genes that contribute to it.

plot_enrichment(enrichment_GO_neg,
                factor = 1,
                max.pathways = n)

plot_enrichment_detailed(
  enrichment_GO_neg,
  factor = 1,
  max.genes = g,
  max.pathways = p
)

Genes whith negative weights

plot_enrichment_heatmap(enrichment_GO_pos)

When only considering genes with positive weight all the factors contains enriched GO terms.

for (i in 1:5){
  print(paste("Factor :", i))
  print(plot_enrichment(enrichment_GO_pos,
                factor = i,
                max.pathways = n))
  
  print(plot_enrichment_detailed(
    enrichment_GO_pos,
    factor = i,
    max.genes = g,
    max.pathways = p))}
## [1] "Factor : 1"

## [1] "Factor : 2"

## [1] "Factor : 3"

## [1] "Factor : 4"

## [1] "Factor : 5"